This analysis examines diplomatic recognition patterns among countries from 1960 to 2020. We’ll analyze how many other countries recognize each country per year, track changes in recognition over time, and identify global recognition patterns.
First, let’s load the diplomatic representation data:
# Load the diplomatic data
diplomacy_data <- read_excel("/Users/yutianyi/Desktop/MA thesis data creation/Diplometrics Diplomatic Representation 1960-2020_20211215.xlsx",
sheet = "Data")
# Also load country codes reference for additional context
cow_continents <- read.csv("cow_continent_crosswalk_complete.csv")
states_info <- read.csv("states2016.csv")
nmc_data<-read.csv("/Users/yutianyi/Desktop/MA thesis data creation/NMC_Documentation-6.0/NMC-60-abridged/NMC-60-abridged.csv")
# Display the structure of the main dataset
str(diplomacy_data)## tibble [413,582 × 12] (S3: tbl_df/tbl/data.frame)
## $ Destination : chr [1:413582] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Destination Region : chr [1:413582] "Asia" "Asia" "Asia" "Asia" ...
## $ Destination Subregion: chr [1:413582] "Southern Asia" "Southern Asia" "Southern Asia" "Southern Asia" ...
## $ Sending Country : chr [1:413582] "Czechoslovakia" "Egypt" "France" "Germany" ...
## $ Sending Region : chr [1:413582] "Europe" "Africa" "Europe" "Europe" ...
## $ Sending Subregion : chr [1:413582] "Eastern Europe" "Northern Africa" "Western Europe" "Western Europe" ...
## $ Year : num [1:413582] 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
## $ Location : chr [1:413582] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Embassy : num [1:413582] 5 6 6 6 6 4 6 5 6 6 ...
## $ Focus : num [1:413582] 1 1 1 1 1 1 1 1 1 1 ...
## $ EmbassyFocus : num [1:413582] 51 61 61 61 61 41 61 51 61 61 ...
## $ LOR : num [1:413582] 0.75 1 1 1 1 0.75 1 0.75 1 1 ...
## # A tibble: 6 × 12
## Destination `Destination Region` `Destination Subregion` `Sending Country`
## <chr> <chr> <chr> <chr>
## 1 Afghanistan Asia Southern Asia Czechoslovakia
## 2 Afghanistan Asia Southern Asia Egypt
## 3 Afghanistan Asia Southern Asia France
## 4 Afghanistan Asia Southern Asia Germany
## 5 Afghanistan Asia Southern Asia India
## 6 Afghanistan Asia Southern Asia Indonesia
## # ℹ 8 more variables: `Sending Region` <chr>, `Sending Subregion` <chr>,
## # Year <dbl>, Location <chr>, Embassy <dbl>, Focus <dbl>, EmbassyFocus <dbl>,
## # LOR <dbl>
Let’s transform the data to count recognitions per country per year:
# Count how many countries recognize each destination country per year
# We consider a country "recognized" if it has any diplomatic representation (Embassy > 0)
country_recognition <- diplomacy_data %>%
filter(!is.na(Embassy) & Embassy > 0) %>%
group_by(Destination, Year) %>%
summarize(
recognition_count = n_distinct(`Sending Country`),
avg_embassy_level = mean(Embassy, na.rm = TRUE),
avg_lor = mean(LOR, na.rm = TRUE),
.groups = "drop"
)
# Get the number of potential sending countries per year
# This helps normalize the recognition counts
total_senders_by_year <- diplomacy_data %>%
group_by(Year) %>%
summarize(
total_potential_senders = n_distinct(`Sending Country`),
.groups = "drop"
)
# Join the data to get recognition percentage
country_recognition <- country_recognition %>%
left_join(total_senders_by_year, by = "Year") %>%
mutate(recognition_percentage = (recognition_count / total_potential_senders) * 100)
# Display the transformed data
head(country_recognition)## # A tibble: 6 × 7
## Destination Year recognition_count avg_embassy_level avg_lor
## <chr> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan 1960 16 5.31 0.906
## 2 Afghanistan 1961 18 5.22 0.889
## 3 Afghanistan 1962 18 5.94 0.931
## 4 Afghanistan 1963 13 1 0.75
## 5 Afghanistan 1964 14 1 0.75
## 6 Afghanistan 1965 14 1 0.75
## # ℹ 2 more variables: total_potential_senders <int>,
## # recognition_percentage <dbl>
Let’s examine how diplomatic recognition has trended over time:
# Calculate the average recognition percentage across all countries by year
overall_trend <- country_recognition %>%
group_by(Year) %>%
summarize(
avg_recognition_percentage = mean(recognition_percentage, na.rm = TRUE),
median_recognition_percentage = median(recognition_percentage, na.rm = TRUE),
countries_count = n_distinct(Destination),
.groups = "drop"
)
# Plot the overall trend
ggplot(overall_trend, aes(x = Year, y = avg_recognition_percentage)) +
geom_line(color = "blue", size = 1) +
geom_smooth(method = "loess", se = TRUE, color = "red", linetype = "dashed") +
labs(
title = "Average Diplomatic Recognition Over Time (1960-2020)",
subtitle = "Percentage of countries receiving diplomatic recognition",
x = "Year",
y = "Average Recognition Percentage",
caption = "Source: Diplometrics Dataset"
) +
theme_minimal() +
scale_y_continuous(labels = function(x) paste0(x, "%"))# Show the number of countries in the dataset over time
ggplot(overall_trend, aes(x = Year, y = countries_count)) +
geom_line(color = "darkgreen", size = 1) +
labs(
title = "Number of Countries in the Diplomatic Dataset (1960-2020)",
x = "Year",
y = "Number of Countries",
caption = "Source: Diplometrics Dataset"
) +
theme_minimal()Let’s look at recognition patterns for specific countries:
# Select a few countries of interest for detailed analysis
countries_of_interest <- c("United States", "Russia", "China", "India", "Brazil",
"South Africa", "Japan", "Germany", "France", "United Kingdom")
# Filter data for these countries
selected_countries <- country_recognition %>%
filter(Destination %in% countries_of_interest)
# Plot recognition trends for selected countries
ggplot(selected_countries, aes(x = Year, y = recognition_percentage, color = Destination)) +
geom_line(size = 1) +
labs(
title = "Diplomatic Recognition Trends for Selected Countries",
subtitle = "Percentage of countries providing diplomatic recognition",
x = "Year",
y = "Recognition Percentage",
color = "Country"
) +
theme_minimal() +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme(legend.position = "right")Let’s examine recognition patterns by region using our continents data:
# Join with continents data to analyze by region
# First, create a simplified version of the cow_continents data
continent_mapping <- cow_continents %>%
select(statenme, continent) %>%
distinct()
# Rename to match our destination column
continent_mapping <- continent_mapping %>%
rename(Destination = statenme)
# Join with our recognition data
recognition_by_continent <- country_recognition %>%
left_join(continent_mapping, by = "Destination")
# Remove entries without continent information
recognition_by_continent <- recognition_by_continent %>%
filter(!is.na(continent))
# Calculate average recognition by continent and year
continent_summary <- recognition_by_continent %>%
group_by(continent, Year) %>%
summarize(
avg_recognition_percentage = mean(recognition_percentage, na.rm = TRUE),
countries_in_region = n(),
.groups = "drop"
)
# Plot regional recognition trends
ggplot(continent_summary, aes(x = Year, y = avg_recognition_percentage, color = continent)) +
geom_line(size = 1) +
labs(
title = "Average Diplomatic Recognition by Continent (1960-2020)",
subtitle = "Average percentage of countries receiving diplomatic recognition",
x = "Year",
y = "Average Recognition Percentage",
color = "Continent"
) +
theme_minimal() +
scale_y_continuous(labels = function(x) paste0(x, "%"))# Step 1: Use states2016.csv as a crosswalk to map country names to codes
# This gives us the ccode for each country in the recognition dataset
recognition_with_ccode <- country_recognition %>%
left_join(states_info %>% select(statenme, ccode),
by = c("Destination" = "statenme"))
# Examine which countries couldn't be matched (NA values)
unmatched_countries <- recognition_with_ccode %>%
filter(is.na(ccode)) %>%
select(Destination) %>%
distinct() %>%
arrange(Destination)
# View the count and list of unmatched countries
print(paste("Number of unmatched countries:", nrow(unmatched_countries)))## [1] "Number of unmatched countries: 23"
## [1] "Unmatched countries:"
## # A tibble: 23 × 1
## Destination
## <chr>
## 1 Antigua and Barbuda
## 2 Cabo Verde
## 3 Congo-Brazzaville
## 4 Congo-Kinshasa
## 5 Côte d’Ivoire
## 6 Gambia, The
## 7 Holy See (Vatican)
## 8 Korea, Democratic People's Republic of
## 9 Korea, Republic of
## 10 Lao People's Democratic Republic
## # ℹ 13 more rows
library(countrycode)
# Apply countrycode to standardize country names
recognition_with_ccode <- country_recognition %>%
# First convert Destination names to ISO country codes
mutate(
ccode = countrycode(
sourcevar = Destination,
origin = "country.name", # Input is country names
destination = "cown", # Convert to Correlates of War codes
warn = FALSE # Suppress warnings for unmatched countries
)
)
# Check which countries could not be matched
unmatched_countries <- recognition_with_ccode %>%
filter(is.na(ccode)) %>%
select(Destination) %>%
distinct() %>%
arrange(Destination)
# Print the results
print(paste("Number of countries matched:",
n_distinct(recognition_with_ccode$Destination) - nrow(unmatched_countries)))## [1] "Number of countries matched: 203"
## [1] "Number of unmatched countries: 2"
# After initial join with countrycode or states_info
recognition_with_ccode <- recognition_with_ccode %>%
mutate(
ccode = case_when(
Destination == "Serbia" ~ 345,
Destination == "Serbia and Montenegro" ~ 345,
TRUE ~ ccode
)
)
# Check that the replacement worked
serbia_check <- recognition_with_ccode %>%
filter(Destination %in% c("Serbia", "Serbia and Montenegro")) %>%
select(Destination, ccode) %>%
distinct()
print(serbia_check)## # A tibble: 2 × 2
## Destination ccode
## <chr> <dbl>
## 1 Serbia 345
## 2 Serbia and Montenegro 345
# After initial join with countrycode or states_info
recognition_with_ccode <- recognition_with_ccode %>%
mutate(
ccode = case_when(
Destination == "Serbia" ~ 345,
Destination == "Serbia and Montenegro" ~ 345,
TRUE ~ ccode
)
)
# Check that the replacement worked
serbia_check <- recognition_with_ccode %>%
filter(Destination %in% c("Serbia", "Serbia and Montenegro")) %>%
select(Destination, ccode) %>%
distinct()
print(serbia_check)## # A tibble: 2 × 2
## Destination ccode
## <chr> <dbl>
## 1 Serbia 345
## 2 Serbia and Montenegro 345
integrated_data <- recognition_with_ccode %>%
# Filter out any remaining NA codes (if you want to only keep matched countries)
filter(!is.na(ccode)) %>%
# Join with NMC data on both country code AND year
inner_join(
nmc_data %>% select(ccode, year, milex, milper, irst, pec, tpop, upop, cinc),
by = c("ccode" = "ccode", "Year" = "year")
)
# Check the result
print(paste("Number of rows in integrated dataset:", nrow(integrated_data)))## [1] "Number of rows in integrated dataset: 9124"
## [1] "Number of countries in integrated dataset: 204"
print(paste("Year range in integrated dataset:", min(integrated_data$Year), "to", max(integrated_data$Year)))## [1] "Year range in integrated dataset: 1960 to 2016"
## # A tibble: 6 × 15
## Destination Year recognition_count avg_embassy_level avg_lor
## <chr> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan 1960 16 5.31 0.906
## 2 Afghanistan 1961 18 5.22 0.889
## 3 Afghanistan 1962 18 5.94 0.931
## 4 Afghanistan 1963 13 1 0.75
## 5 Afghanistan 1964 14 1 0.75
## 6 Afghanistan 1965 14 1 0.75
## # ℹ 10 more variables: total_potential_senders <int>,
## # recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## # irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>
integrated_data_with_continent <- integrated_data %>%
# Use ccode for joining since both datasets have this field
left_join(
cow_continents %>% select(ccode, continent),
by = c("ccode" = "ccode")
)
# Check how many entries couldn't be matched with a continent
print(paste("Rows without continent information:",
sum(is.na(integrated_data_with_continent$continent))))## [1] "Rows without continent information: 9"
# If there are missing continents, examine which countries are missing
if(sum(is.na(integrated_data_with_continent$continent)) > 0) {
missing_continents <- integrated_data_with_continent %>%
filter(is.na(continent)) %>%
select(Destination, ccode) %>%
distinct()
print("Countries missing continent information:")
print(head(missing_continents, 10))
}## [1] "Countries missing continent information:"
## # A tibble: 1 × 2
## Destination ccode
## <chr> <dbl>
## 1 Kosovo 347
# Check the result
print(paste("Number of countries in final dataset:",
n_distinct(integrated_data_with_continent$Destination)))## [1] "Number of countries in final dataset: 204"
print(paste("Number of continents represented:",
n_distinct(integrated_data_with_continent$continent, na.rm = TRUE)))## [1] "Number of continents represented: 6"
# Assign "Europe" to Kosovo (ccode 347) where continent is NA
integrated_data_with_continent <- integrated_data_with_continent %>%
mutate(
continent = case_when(
ccode == 347 & is.na(continent) ~ "Europe",
TRUE ~ continent
)
)
# Check that Kosovo now has Europe assigned
kosovo_check <- integrated_data_with_continent %>%
filter(Destination == "Kosovo") %>%
select(Destination, ccode, continent) %>%
distinct()
print(kosovo_check)## # A tibble: 1 × 3
## Destination ccode continent
## <chr> <dbl> <chr>
## 1 Kosovo 347 Europe
# Display continent distribution
continent_distribution <- integrated_data_with_continent %>%
filter(!is.na(continent)) %>%
select(Destination, continent) %>%
distinct() %>%
group_by(continent) %>%
summarize(country_count = n()) %>%
arrange(desc(country_count))
print("Country distribution by continent:")## [1] "Country distribution by continent:"
## # A tibble: 6 × 2
## continent country_count
## <chr> <int>
## 1 Africa 54
## 2 Asia 51
## 3 Europe 50
## 4 North America 23
## 5 Oceania 14
## 6 South America 12
## # A tibble: 6 × 16
## Destination Year recognition_count avg_embassy_level avg_lor
## <chr> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan 1960 16 5.31 0.906
## 2 Afghanistan 1961 18 5.22 0.889
## 3 Afghanistan 1962 18 5.94 0.931
## 4 Afghanistan 1963 13 1 0.75
## 5 Afghanistan 1964 14 1 0.75
## 6 Afghanistan 1965 14 1 0.75
## # ℹ 11 more variables: total_potential_senders <int>,
## # recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## # irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>, continent <chr>
# Load the Polity5 data
polity5_data <- read_excel("p5v2018 2.xls", sheet = "p5v2018")
# Filter to only include years 1960-2016
polity5_filtered <- polity5_data %>%
filter(year >= 1960 & year <= 2016)
# Check the result
print(paste("Original Polity5 data rows:", nrow(polity5_data)))## [1] "Original Polity5 data rows: 17574"
## [1] "Filtered Polity5 data rows (1960-2016): 8464"
## [1] "Number of countries in filtered data: 180"
print(paste("Year range in filtered data:", min(polity5_filtered$year), "to", max(polity5_filtered$year)))## [1] "Year range in filtered data: 1960 to 2016"
# View the key variables we might be interested in
polity5_summary <- polity5_filtered %>%
select(ccode, country, year, democ, autoc, polity, polity2, durable, xconst) %>%
head(10)
print("Sample of filtered Polity5 data:")## [1] "Sample of filtered Polity5 data:"
## # A tibble: 10 × 9
## ccode country year democ autoc polity polity2 durable xconst
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 700 Afghanistan 1960 0 10 -10 -10 NA 1
## 2 700 Afghanistan 1961 0 10 -10 -10 NA 1
## 3 700 Afghanistan 1962 0 10 -10 -10 NA 1
## 4 700 Afghanistan 1963 0 10 -10 -10 NA 1
## 5 700 Afghanistan 1964 0 7 -7 -7 0 3
## 6 700 Afghanistan 1965 0 7 -7 -7 1 3
## 7 700 Afghanistan 1966 0 7 -7 -7 2 3
## 8 700 Afghanistan 1967 0 7 -7 -7 3 3
## 9 700 Afghanistan 1968 0 7 -7 -7 4 3
## 10 700 Afghanistan 1969 0 7 -7 -7 5 3
# Join Polity5 data with the integrated data
fully_integrated_data <- integrated_data_with_continent %>%
left_join(
polity5_filtered %>%
select(ccode, year, democ, autoc, polity, polity2, durable, xconst),
by = c("ccode" = "ccode", "Year" = "year")
)
# Check the join results
print(paste("Rows in fully integrated dataset:", nrow(fully_integrated_data)))## [1] "Rows in fully integrated dataset: 9124"
## [1] "Rows with Polity data: 8120"
print(paste("Percentage with Polity data:",
round(mean(!is.na(fully_integrated_data$polity)) * 100, 1), "%"))## [1] "Percentage with Polity data: 89 %"
# The final fully integrated dataset now includes:
# - Recognition data
# - Material capabilities data (NMC)
# - Continent information
# - Regime characteristics (Polity5)
head(fully_integrated_data)## # A tibble: 6 × 22
## Destination Year recognition_count avg_embassy_level avg_lor
## <chr> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan 1960 16 5.31 0.906
## 2 Afghanistan 1961 18 5.22 0.889
## 3 Afghanistan 1962 18 5.94 0.931
## 4 Afghanistan 1963 13 1 0.75
## 5 Afghanistan 1964 14 1 0.75
## 6 Afghanistan 1965 14 1 0.75
## # ℹ 17 more variables: total_potential_senders <int>,
## # recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## # irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>, continent <chr>,
## # democ <dbl>, autoc <dbl>, polity <dbl>, polity2 <dbl>, durable <dbl>,
## # xconst <dbl>
# Find countries where all polity data is NA across all years
countries_missing_polity <- fully_integrated_data %>%
# Group by country
group_by(Destination, ccode) %>%
# Check if ALL rows for this country have NA for all polity variables
summarize(
row_count = n(),
all_polity_missing = all(is.na(democ) & is.na(autoc) & is.na(polity) &
is.na(polity2) & is.na(durable) & is.na(xconst)),
.groups = "drop"
) %>%
# Keep only countries where all polity data is missing
filter(all_polity_missing == TRUE) %>%
# Sort by country name
arrange(Destination)
# Print the results
print(paste("Number of countries with no polity data:", nrow(countries_missing_polity)))## [1] "Number of countries with no polity data: 32"
## [1] "Countries missing all polity data:"
## # A tibble: 32 × 4
## Destination ccode row_count all_polity_missing
## <chr> <dbl> <int> <lgl>
## 1 Andorra 232 23 TRUE
## 2 Antigua and Barbuda 58 32 TRUE
## 3 Bahamas 31 41 TRUE
## 4 Barbados 53 50 TRUE
## 5 Belize 80 34 TRUE
## 6 Brunei 835 33 TRUE
## 7 Dominica 54 30 TRUE
## 8 Grenada 55 37 TRUE
## 9 Iceland 395 57 TRUE
## 10 Kiribati 946 18 TRUE
## # ℹ 22 more rows
# Alternatively, you could check which states have ANY polity data
countries_with_polity <- fully_integrated_data %>%
group_by(Destination, ccode) %>%
summarize(
has_any_polity_data = any(!is.na(polity) | !is.na(polity2)),
.groups = "drop"
) %>%
filter(has_any_polity_data == FALSE)
# Compare the counts
print(paste("Number of countries with no polity or polity2 data:",
nrow(countries_with_polity)))## [1] "Number of countries with no polity or polity2 data: 32"
# First, let's check the year ranges for each Vietnam entity
vietnam_data <- fully_integrated_data %>%
filter(ccode %in% c(816, 817, 818)) %>%
group_by(Destination, ccode) %>%
summarize(
min_year = min(Year),
max_year = max(Year),
n_observations = n(),
.groups = "drop"
)
print("Time periods for Vietnam entities:")## [1] "Time periods for Vietnam entities:"
## # A tibble: 3 × 5
## Destination ccode min_year max_year n_observations
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Viet Nam 816 1977 2016 40
## 2 Viet Nam, Democratic Republic of 816 1960 1976 5
## 3 Viet Nam, Republic of 816 1960 1974 15
# Replace ccodes for Vietnam entities with their specific codes
fully_integrated_data <- fully_integrated_data %>%
mutate(
ccode = case_when(
Destination == "Viet Nam" ~ 818,
Destination == "Viet Nam, Democratic Republic of" ~ 816,
Destination == "Viet Nam, Republic of" ~ 817,
TRUE ~ ccode
)
)# Step 1: Identify states where all Polity 5 variables are NA
states_missing_polity <- fully_integrated_data %>%
group_by(Destination, ccode) %>%
summarize(
row_count = n(),
all_polity_missing = all(is.na(polity) & is.na(polity2) & is.na(democ) &
is.na(autoc) & is.na(durable) & is.na(xconst)),
.groups = "drop"
) %>%
filter(all_polity_missing == TRUE) %>%
arrange(Destination)
# Print summary
print(paste("Number of states with all Polity variables missing:", nrow(states_missing_polity)))## [1] "Number of states with all Polity variables missing: 32"
# Step 2: Check if these states exist in polity5_filtered by ccode
polity5_ccodes <- unique(polity5_filtered$ccode)
polity5_countries <- unique(polity5_filtered$country)
# Add columns to indicate if the state exists in polity5_filtered
states_missing_polity <- states_missing_polity %>%
mutate(
exists_by_ccode = ccode %in% polity5_ccodes,
# Create a standardized name column to help with matching
dest_standardized = gsub("[^[:alnum:]]", "", tolower(Destination))
)
# Create standardized country names for polity5 data
polity5_countries_std <- gsub("[^[:alnum:]]", "", tolower(polity5_countries))
# Check if they exist by name (approximate matching)
states_missing_polity <- states_missing_polity %>%
mutate(
exists_by_name = dest_standardized %in% polity5_countries_std
) %>%
select(-dest_standardized) # Remove the temporary column# Step 3: Categorize the results
states_missing_polity <- states_missing_polity %>%
mutate(
status = case_when(
exists_by_ccode ~ "In Polity5 (matched by ccode)",
exists_by_name ~ "In Polity5 (matched by name, but ccode differs)",
TRUE ~ "Not in Polity5 dataset"
)
)
# Print the detailed results
print("States missing Polity data and their status:")## [1] "States missing Polity data and their status:"
## # A tibble: 32 × 7
## Destination ccode row_count all_polity_missing exists_by_ccode exists_by_name
## <chr> <dbl> <int> <lgl> <lgl> <lgl>
## 1 Andorra 232 23 TRUE FALSE FALSE
## 2 Antigua an… 58 32 TRUE FALSE FALSE
## 3 Bahamas 31 41 TRUE FALSE FALSE
## 4 Barbados 53 50 TRUE FALSE FALSE
## 5 Belize 80 34 TRUE FALSE FALSE
## 6 Brunei 835 33 TRUE FALSE FALSE
## 7 Dominica 54 30 TRUE FALSE FALSE
## 8 Grenada 55 37 TRUE FALSE FALSE
## 9 Iceland 395 57 TRUE FALSE FALSE
## 10 Kiribati 946 18 TRUE FALSE FALSE
## # ℹ 22 more rows
## # ℹ 1 more variable: status <chr>
# Step 4: Summarize the findings
summary_table <- states_missing_polity %>%
group_by(status) %>%
summarize(
count = n(),
.groups = "drop"
)
print("Summary of findings:")## [1] "Summary of findings:"
## # A tibble: 2 × 2
## status count
## <chr> <int>
## 1 In Polity5 (matched by ccode) 4
## 2 Not in Polity5 dataset 28
# Step 5: For states that exist in Polity5 but have missing data in the integrated dataset,
# Check their actual ccodes in polity5_filtered to identify potential mismatches
mismatched_states <- states_missing_polity %>%
filter(exists_by_ccode) %>%
select(Destination, ccode)
if(nrow(mismatched_states) > 0) {
print("States that exist in Polity5 but have missing data in the integrated dataset:")
for(i in 1:nrow(mismatched_states)) {
state_ccode <- mismatched_states$ccode[i]
polity_years <- polity5_filtered %>%
filter(ccode == state_ccode) %>%
summarize(
min_year = min(year),
max_year = max(year),
country_name = first(country)
)
print(paste0("- ", mismatched_states$Destination[i],
" (ccode ", state_ccode, "): ",
"In Polity5 as '", polity_years$country_name, "' ",
"from ", polity_years$min_year,
" to ", polity_years$max_year))
}
}## [1] "States that exist in Polity5 but have missing data in the integrated dataset:"
## [1] "- Kosovo (ccode 347): In Polity5 as 'Yugoslavia' from 1991 to 2006"
## [1] "- Serbia (ccode 345): In Polity5 as 'Yugoslavia' from 1960 to 1991"
## [1] "- Serbia and Montenegro (ccode 345): In Polity5 as 'Yugoslavia' from 1960 to 1991"
## [1] "- Viet Nam (ccode 818): In Polity5 as 'Vietnam' from 1976 to 2016"
# Update the ccodes for Kosovo, Serbia, and Serbia and Montenegro
fully_integrated_data <- fully_integrated_data %>%
mutate(
ccode = case_when(
Destination == "Kosovo" ~ 341,
Destination == "Serbia" ~ 342,
Destination == "Serbia and Montenegro" ~ 347,
TRUE ~ ccode
)
)
# Verify the changes were applied correctly
verification <- fully_integrated_data %>%
filter(Destination %in% c("Kosovo", "Serbia", "Serbia and Montenegro")) %>%
select(Destination, ccode) %>%
distinct()
print("Updated country codes:")## [1] "Updated country codes:"
## # A tibble: 3 × 2
## Destination ccode
## <chr> <dbl>
## 1 Kosovo 341
## 2 Serbia 342
## 3 Serbia and Montenegro 347
# First, remove the existing Polity5 variables from fully_integrated_data
# This prevents duplicate columns when we join again
fully_integrated_data <- fully_integrated_data %>%
select(-democ, -autoc, -polity, -polity2, -durable, -xconst)
# Now rejoin with polity5_filtered using the updated country codes
fully_integrated_data_updated <- fully_integrated_data %>%
left_join(
polity5_filtered %>%
select(ccode, year, democ, autoc, polity, polity2, durable, xconst),
by = c("ccode" = "ccode", "Year" = "year")
)
# Check how many rows have Polity data now
print(paste("Total rows in dataset:", nrow(fully_integrated_data_updated)))## [1] "Total rows in dataset: 9124"
## [1] "Rows with Polity data: 8194"
print(paste("Percentage with Polity data:",
round(mean(!is.na(fully_integrated_data_updated$polity)) * 100, 1), "%"))## [1] "Percentage with Polity data: 89.8 %"
# Check specifically for Kosovo, Serbia, and Serbia and Montenegro
balkan_states <- fully_integrated_data_updated %>%
filter(Destination %in% c("Kosovo", "Serbia", "Serbia and Montenegro")) %>%
group_by(Destination, ccode) %>%
summarize(
total_rows = n(),
rows_with_polity = sum(!is.na(polity)),
percentage_with_polity = round(mean(!is.na(polity)) * 100, 1),
.groups = "drop"
)
print("Polity data for Balkan states after update:")## [1] "Polity data for Balkan states after update:"
## # A tibble: 3 × 5
## Destination ccode total_rows rows_with_polity percentage_with_polity
## <chr> <dbl> <int> <int> <dbl>
## 1 Kosovo 341 9 9 100
## 2 Serbia 342 10 10 100
## 3 Serbia and Montenegro 347 15 15 100
# Replace the old dataset with the updated one
fully_integrated_data <- fully_integrated_data_updated# First, identify which states have all Polity 5 variables missing
states_with_missing_polity <- fully_integrated_data %>%
group_by(Destination, ccode) %>%
summarize(
all_polity_missing = all(is.na(polity) & is.na(polity2) & is.na(democ) &
is.na(autoc) & is.na(durable) & is.na(xconst)),
.groups = "drop"
) %>%
filter(all_polity_missing == TRUE)
# Get the list of states to filter out
states_to_remove <- states_with_missing_polity %>%
select(Destination) %>%
pull()
# Print info about what we're filtering
print(paste("Filtering out", length(states_to_remove), "states with missing Polity data"))## [1] "Filtering out 28 states with missing Polity data"
## [1] "States being removed:"
## [1] "Andorra" "Antigua and Barbuda"
## [3] "Bahamas" "Barbados"
## [5] "Belize" "Brunei"
## [7] "Dominica" "Grenada"
## [9] "Iceland" "Kiribati"
## [11] "Liechtenstein" "Maldives"
## [13] "Malta" "Marshall Islands"
## [15] "Micronesia, Federated States of" "Monaco"
## [17] "Nauru" "Palau"
## [19] "Saint Kitts and Nevis" "Saint Lucia"
## [21] "Saint Vincent and the Grenadines" "Samoa"
## [23] "San Marino" "Sao Tome and Principe"
## [25] "Seychelles" "Tonga"
## [27] "Tuvalu" "Vanuatu"
# Create the filtered dataset
fully_integrated_data_filtered <- fully_integrated_data %>%
filter(!(Destination %in% states_to_remove))
# Check the results
print(paste("Original dataset rows:", nrow(fully_integrated_data)))## [1] "Original dataset rows: 9124"
## [1] "Filtered dataset rows: 8276"
## [1] "Rows removed: 848"
# Check that all remaining states have at least some Polity data
remaining_missing_check <- fully_integrated_data_filtered %>%
group_by(Destination) %>%
summarize(
all_polity_still_missing = all(is.na(polity)),
.groups = "drop"
) %>%
filter(all_polity_still_missing == TRUE)
print(paste("States remaining with all polity still missing:", nrow(remaining_missing_check)))## [1] "States remaining with all polity still missing: 0"
# Use this filtered dataset for further analysis
fully_integrated_data <- fully_integrated_data_filtered## [1] "Destination" "Year"
## [3] "recognition_count" "avg_embassy_level"
## [5] "avg_lor" "total_potential_senders"
## [7] "recognition_percentage" "ccode"
## [9] "milex" "milper"
## [11] "irst" "pec"
## [13] "tpop" "upop"
## [15] "cinc" "continent"
## [17] "democ" "autoc"
## [19] "polity" "polity2"
## [21] "durable" "xconst"
## tibble [8,276 × 22] (S3: tbl_df/tbl/data.frame)
## $ Destination : chr [1:8276] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : num [1:8276] 1960 1961 1962 1963 1964 ...
## $ recognition_count : int [1:8276] 16 18 18 13 14 14 14 14 22 22 ...
## $ avg_embassy_level : num [1:8276] 5.31 5.22 5.94 1 1 ...
## $ avg_lor : num [1:8276] 0.906 0.889 0.931 0.75 0.75 ...
## $ total_potential_senders: int [1:8276] 94 114 116 118 126 127 128 133 134 137 ...
## $ recognition_percentage : num [1:8276] 17 15.8 15.5 11 11.1 ...
## $ ccode : num [1:8276] 700 700 700 700 700 700 700 700 700 700 ...
## $ milex : int [1:8276] 14240 14184 13125 14795 14544 13865 16819 17932 20067 20915 ...
## $ milper : int [1:8276] 60 110 104 110 112 97 97 95 89 89 ...
## $ irst : int [1:8276] 0 0 0 0 0 0 0 0 0 0 ...
## $ pec : int [1:8276] 48 66 112 99 113 144 162 331 405 181 ...
## $ tpop : num [1:8276] 10775 11014 11262 11521 11791 ...
## $ upop : num [1:8276] 444 462 481 500 478 456 434 412 448 483 ...
## $ cinc : num [1:8276] 0.00131 0.00176 0.00166 0.00166 0.00164 ...
## $ continent : chr [1:8276] "Asia" "Asia" "Asia" "Asia" ...
## $ democ : num [1:8276] 0 0 0 0 0 0 0 0 0 0 ...
## $ autoc : num [1:8276] 10 10 10 10 7 7 7 7 7 7 ...
## $ polity : num [1:8276] -10 -10 -10 -10 -7 -7 -7 -7 -7 -7 ...
## $ polity2 : num [1:8276] -10 -10 -10 -10 -7 -7 -7 -7 -7 -7 ...
## $ durable : num [1:8276] NA NA NA NA 0 1 2 3 4 5 ...
## $ xconst : num [1:8276] 1 1 1 1 3 3 3 3 3 3 ...